Summary report on prophage clustering

Author

Lakhansing Pardeshi

Published

May 23, 2024


Initial setup

Code
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(skimr))
suppressPackageStartupMessages(library(ComplexHeatmap))
suppressPackageStartupMessages(library(ggdist))
suppressPackageStartupMessages(library(ggtree))
suppressPackageStartupMessages(library(org.Pectobacterium.spp.pan.eg.db))

rm(list = ls())

source("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/RScripts/utils.R")
source("scripts/utils/config_functions.R")
source("scripts/utils/phylogeny_functions.R")
source("scripts/utils/compare_hg_sets.R")
################################################################################
set.seed(124)

confs <- prefix_config_paths(
  conf = suppressWarnings(configr::read.config(file = "project_config.yaml")),
  dir = "."
)

treeMethod <- "core_snp_ml"     #ani_upgma, kmer_nj, core_snp_ml

pangenome <- confs$data$pangenomes$pectobacterium.v2$name
panConf <- confs$data$pangenomes[[pangenome]]

outDir <- confs$analysis$prophages$summary$dir

panOrgDb <- org.Pectobacterium.spp.pan.eg.db

Import data

Code
sampleInfo <- get_metadata(file = panConf$files$metadata, genus = confs$genus)

sampleInfoList <- as.list_metadata(
  df = sampleInfo, sampleId, sampleName, SpeciesName, strain, nodeLabs, genomeId
)

speciesOrder <- suppressMessages(
  readr::read_tsv(confs$analysis$phylogeny[[treeMethod]]$files$species_order)
)

## read tree
rawTree <- import_tree(
  confs$analysis$phylogeny[[treeMethod]]$files$tree_rooted,
  phylo = TRUE
)

rawTree <- representative_genomes_tree(phy = rawTree, metadata = sampleInfo)

Import phage clustering data

Code
phageAn <- suppressMessages(
  readr::read_tsv(confs$data$prophages$files$data)
) %>%
  dplyr::select(prophage_id, completeness, checkv_quality, taxonomy)

# read prophage HGs stored locally
proHgs <- suppressMessages(
  readr::read_tsv(confs$analysis$prophages$preprocessing$files$raw_prophage_hg)
) %>%
  dplyr::select(prophage_id = id, nHgs, hgs) %>%
  dplyr::mutate(
    hgs = stringr::str_split(hgs, ";")
  )

proHgL <- purrr::transpose(proHgs) %>%
  purrr::set_names(nm = purrr::map(., "prophage_id"))

Prophage data processing summary

Code
phageClusters <- suppressMessages(
  readr::read_tsv(confs$analysis$prophages$files$clusters)
)

clusterList <- dplyr::group_by(phageClusters, phage_grp) %>% 
  dplyr::group_map(
    .f = ~{
      list(
        phage_grp = .x$phage_grp[1],
        members = .x$prophage_id,
        group_size = nrow(.x)
      )
    },
    .keep = TRUE
  ) %>% 
  purrr::set_names(nm = purrr::map(., "phage_grp"))

Clustering

Number of clusters/prophage signatures: 436

Annotate prophages with defense systems

Code
defense_hgs <- suppressMessages(
  readr::read_tsv(confs$analysis$prophages$files$hg_broad_functions)
) %>% 
  dplyr::filter(
    function_category %in% 
      c("toxin-antitoxin", "restriction modification", "methyltransferase")
  ) %>%
  dplyr::mutate(
    function_category = dplyr::case_match(
      function_category,
      "toxin-antitoxin" ~ "ta_system",
      "restriction modification" ~ "rm_system",
      "methyltransferase" ~ "rm_system",
      .default = function_category
    )
  ) %>% 
  dplyr::left_join(
    y = tidyr::unnest(proHgs, cols = hgs),
    by = c("hg_id" = "hgs")
  ) %>% 
  dplyr::summarise(
    hgs = paste(sort(hg_id), collapse = ";"),
    .by = c(prophage_id, function_category)
  ) %>% 
  tidyr::pivot_wider(
    names_from = function_category,
    values_from = hgs
  )

phage_defense <- dplyr::select(phageClusters, prophage_id, phage_grp, fragments, grp_size) %>% 
  tidyr::separate_longer_delim(cols = fragments, delim = ";") %>% 
  dplyr::left_join(
    y = defense_hgs, by = c("fragments" = "prophage_id")
  ) %>% 
  dplyr::filter(!is.na(ta_system) | !is.na(rm_system)) %>% 
  dplyr::distinct(prophage_id, phage_grp, ta_system, rm_system, grp_size) %>% 
  dplyr::arrange(desc(grp_size))

readr::write_tsv(
  x = phage_defense,
  file = confs$analysis$prophages$files$phage_defense
)

Prophage overlap between species

Code
n_genomes_phage <- dplyr::distinct(phageClusters, genomeId, SpeciesName) %>%
  dplyr::count(SpeciesName, name = "n_genomes_phages", sort = TRUE)

clustWiseCounts <- dplyr::group_by(phageClusters, phage_grp, SpeciesName) %>% 
  dplyr::summarise(n_genomes_clust = n(), .groups = "drop") %>% 
  dplyr::add_count(phage_grp, name = "n_species_phages", sort = TRUE) %>% 
  dplyr::left_join(
    y = n_genomes_phage, by = "SpeciesName"
  ) %>% 
  dplyr::mutate(
    SpeciesName = forcats::fct_relevel(SpeciesName, !!!speciesOrder$SpeciesName),
    SpeciesName = forcats::fct_drop(SpeciesName),
    phage_grp = forcats::as_factor(phage_grp),
    cluster_per_species = n_genomes_clust / n_genomes_phages
  )

Generalist and specialist prophages statistics:

Code
generalist <- dplyr::filter(clustWiseCounts, n_species_phages > 1) %>% 
  dplyr::mutate(
    dplyr::across(
      .cols = c(phage_grp, SpeciesName), .fns = forcats::fct_drop
    ),
    category = "generalist"
  )

specialist <- dplyr::filter(clustWiseCounts, n_species_phages == 1) %>% 
  dplyr::arrange(dplyr::desc(n_genomes_clust)) %>% 
  dplyr::mutate(category = "specialist")

Generalist prophage signatures: 54

Specialist prophage signatures: 382

Distribution of number of species in which prophage signatures are found:

Prophage target species number histogram

Code
pt_hist <- dplyr::distinct(clustWiseCounts, phage_grp, n_species_phages) %>% 
  ggplot2::ggplot(
    mapping = aes(x = n_species_phages)
  ) +
  ggplot2::geom_histogram(binwidth = 1, color = "black", fill = "black") +
  ggbreak::scale_y_break(
    breaks = c(40, 360),
    expand = expansion(mult = c(0, 0.05))
  ) +
  ggplot2::labs(
    x = "Prophage in multiple species",
    y = "Count"
  ) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.05))) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  theme_bw(base_size = 24) +
  theme(
    panel.grid = element_blank(),
    axis.ticks.y.right = element_blank(),
    axis.text.y.right = element_blank()
  )

Species wise generalist and specialist prophage signatures

Code
# sort(table(specialist$SpeciesName), decreasing = TRUE)
pt_species_count <- dplyr::bind_rows(
  list(
    specialist = dplyr::count(specialist, SpeciesName),
    generalist = dplyr::count(generalist, SpeciesName)
  ),
  .id = "category"
) %>% 
  dplyr::mutate(
    category = forcats::fct_relevel(category, "specialist")
  ) %>% 
  ggplot2::ggplot(
    mapping = aes(y = forcats::fct_rev(SpeciesName), x = n, fill = category)
  ) +
  ggplot2::labs(
    x = "Unique prophage signatures", y = NULL
  ) +
  ggplot2::geom_col(position = "stack", color = "black") +
  ggplot2::scale_x_continuous(
    expand = expansion(mult = c(0, 0.05))
  ) +
  scale_fill_manual(
    values = c("black", "white"),
    breaks = c("generalist", "specialist")
  ) +
  theme_bw(base_size = 18) +
  theme(
    legend.position = "bottom",
    axis.text.y = element_text(face = "italic"),
    axis.ticks.y = element_blank(),
    panel.grid = element_blank()
  )

Number of generalist targeting two species (except carotovoricin)

Code
sp_vs_sp_grid <- tidyr::expand_grid(
  sp1 = speciesOrder$SpeciesName,
  sp2 = speciesOrder$SpeciesName
) %>% 
  dplyr::filter(sp1 != sp2)

phage_common_targets <- split(
  x = as.character(generalist$SpeciesName),
  f = as.character(generalist$phage_grp)
) %>% 
  purrr::discard_at(at = "phage_grp_1") %>% 
  purrr::map(
    .f = function(x){
      tidyr::expand_grid(sp1 = x, sp2 = x) %>% 
        dplyr::filter(sp1 != sp2)
    }
  ) %>% 
  purrr::list_rbind() %>% 
  dplyr::count(sp1, sp2) %>% 
  dplyr::right_join(y = sp_vs_sp_grid, by = c("sp1", "sp2")) %>% 
  dplyr::mutate(
    dplyr::across(
      .cols = c(sp1, sp2),
      .fns = ~forcats::fct_relevel(.x, !!speciesOrder$SpeciesName)
    )
  ) %>% 
  dplyr::filter(sp1 != sp2) %>% 
  tidyr::replace_na(replace = list(n = 0))


pt_phage_overlap <- ggplot2::ggplot(
  data = phage_common_targets,
  mapping = aes(x = sp1, y = forcats::fct_rev(sp2), size = n)
) +
  ggplot2::geom_point(color = "black", fill = "black", shape = 21) +
  ggplot2::scale_size_continuous(
    name = "Shared prophage signatures",
    range = c(2, 10),
    limits = c(1, NA),
    breaks = c(1, 3, 5, 7, 11)
  ) +
  theme_bw(base_size = 18) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(face = "italic"),
    legend.position = "bottom",
    axis.title = element_blank()
  )
Warning: Removed 320 rows containing missing values (`geom_point()`).
Removed 320 rows containing missing values (`geom_point()`).

Prophage signature overlap across two or more species

Code
pt_dot <- ggplot2::ggplot(data = generalist) + 
  ggplot2::geom_point(
    mapping = aes(x = phage_grp, y = forcats::fct_rev(SpeciesName)),
    size = 3
  ) +
  ggplot2::labs(
    y = NULL, x = "Prophage clusters"
  ) +
  theme_bw(base_size = 16) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text.y = element_text(face = "italic"),
    legend.position = "bottom"
  )

Generalist prophages target upset plot

Code
cm <- ComplexHeatmap::make_comb_mat(
  split(x = clustWiseCounts$phage_grp, f = clustWiseCounts$SpeciesName)
)

phage_upset <- ComplexHeatmap::UpSet(
  m = cm,
  column_title = "Prophage across species",
  row_names_gp = gpar(fontface = "italic"),
  column_title_gp = gpar(fontface = "bold", fontsize = 16),
  set_order = speciesOrder$SpeciesName,
  right_annotation = upset_right_annotation(cm, add_numbers = TRUE)
)
quartz_off_screen 
                2